library(haven)
library(MASS)
library(cshapes)
library(dplyr)
library(ggplot2)
library(boot)
library(rworldmap)
library(RColorBrewer)
library(countrycode)
library(BMS)
library(QuantPsyc)



#Importing data
main.data <- read_dta("replication_data.dta")

#FIGURE 1#########################

##### NOTE########:
# Creating  this plot takes upto 5 hours with a 2016 Macbook Air, on average. Changing "1000" on line 60 to, say, 100, will reduce the time needed, but also produce slightly different results
##################

lm.gdp.form <- refugees ~  log_bdeadhigh + log_gdppc_gleditsch + 
  territoryincompatibility + internationalization + polity2 +  interstatewar +     
  genocide + log_population_gleditsch + yearinconflict + yearinconflictSq+
  countries_in500km + gdppc_weighted 
lm.polity.form <- refugees ~ log_bdeadhigh + log_gdppc_gleditsch + 
  territoryincompatibility + internationalization + polity2 +  interstatewar +
  genocide + log_population_gleditsch +  yearinconflict + yearinconflictSq +
  countries_in500km + polity_weighted 

gdppc.model <- glm.nb(lm.gdp.form, data=main.data)
polity.model <- glm.nb(lm.polity.form, data=main.data)

to.estimate <- main.data %>%
  summarize(log_bdeadhigh=median(log_bdeadhigh, na.rm=T),
            log_gdppc_gleditsch=median(log_gdppc_gleditsch, na.rm=T),
            territoryincompatibility=0,
            internationalization=0,
            polity2=median(polity2, na.rm=T),
            interstatewar=0,
            genocide=0,
            log_population_gleditsch=median(log_population_gleditsch, na.rm=T),
            yearinconflict=median(yearinconflict, na.rm = T),
            yearinconflictSq = median(yearinconflictSq, na.rm = T),
            countries_in500km=median(countries_in500km, na.rm=T),
            gdppc_weighted=median(gdppc_weighted, na.rm=T),
            polity_weighted=median(polity_weighted, na.rm=T))


output <- variable <- upper <- lower <- NULL

#territory incompatibility
change.data <- rbind(to.estimate, to.estimate)
change.data$territoryincompatibility[2] <- 1
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Territory Incompatibility")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#internationalization
change.data <- rbind(to.estimate, to.estimate)
change.data$internationalization[2] <- 1
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Internationalization")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#interstatewar
change.data <- rbind(to.estimate, to.estimate)
change.data$interstatewar[2] <- 1
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Interstate War")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#genocide
change.data <- rbind(to.estimate, to.estimate)
change.data$genocide[2] <- 1
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Genocide")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#battle deaths
change.data <- rbind(to.estimate, to.estimate)
change.data$log_bdeadhigh[1] <- quantile(main.data$log_bdeadhigh, na.rm=T, 0.25)
change.data$log_bdeadhigh[2] <- quantile(main.data$log_bdeadhigh, na.rm=T, 0.75)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Battle Deaths")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL   

#gddpc
change.data <- rbind(to.estimate, to.estimate)
change.data$log_gdppc_gleditsch[1] <- quantile(main.data$log_gdppc_gleditsch, na.rm = T, 0.25)
change.data$log_gdppc_gleditsch[2] <- quantile(main.data$log_gdppc_gleditsch, na.rm = T, 0.75)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "GDPPC")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#polity
change.data <- rbind(to.estimate, to.estimate)
change.data$polity2[1] <- quantile(main.data$polity2, 0.25, na.rm = T)
change.data$polity2[2] <- quantile(main.data$polity2, 0.75, na.rm = T)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Polity")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#population
change.data <- rbind(to.estimate, to.estimate)
change.data$log_population_gleditsch[1] <- quantile(main.data$log_population_gleditsch, na.rm = T, 0.25)
change.data$log_population_gleditsch[2] <- quantile(main.data$log_population_gleditsch, na.rm = T, 0.75)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Population")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL   

#year in conflict
change.data <- rbind(to.estimate, to.estimate)
change.data$yearinconflict[1] <- quantile(main.data$yearinconflict, na.rm = T, 0.25)
change.data$yearinconflict[2] <- quantile(main.data$yearinconflict, na.rm = T, 0.75)
change.data$yearinconflictSq[1] <- quantile(main.data$yearinconflictSq, na.rm = T, 0.25)
change.data$yearinconflictSq[2] <- quantile(main.data$yearinconflictSq, na.rm = T, 0.75)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Year in Conflict")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#countries in 500 km
change.data <- rbind(to.estimate, to.estimate)
change.data$countries_in500km[1] <- quantile(main.data$countries_in500km, na.rm = T, 0.25)
change.data$countries_in500km[2] <- quantile(main.data$countries_in500km, na.rm = T, 0.75)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "Neighbors")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

#gdppc weighted
change.data <- rbind(to.estimate, to.estimate)
change.data$gdppc_weighted[1] <- quantile(main.data$gdppc_weighted, na.rm = T, 0.25)
change.data$gdppc_weighted[2] <- quantile(main.data$gdppc_weighted, na.rm = T, 0.75)
change.data$predicted <- predict(gdppc.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable,'W_GDPPC')
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.gdp.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL   

#polity weighted
change.data <- rbind(to.estimate, to.estimate)
change.data$polity_weighted[1] <- quantile(main.data$polity_weighted, na.rm = T, 0.25)
change.data$polity_weighted[2] <- quantile(main.data$polity_weighted, na.rm = T, 0.75)
change.data$predicted <- predict(polity.model, type='response', newdata = change.data)
output <-c(output, change.data$predicted[2]-change.data$predicted[1])
variable <- c(variable, "W_Polity")
change.data$predicted <- NULL
boot.output <- NULL
set.seed(123)
for(i in 1:1000){
  tryCatch({
    data.boot <- sample_n(main.data, nrow(main.data), replace = T)
    model.boot <- glm.nb(lm.polity.form, data=data.boot)
    change.data$predicted <- predict(model.boot, type='response', newdata = change.data)
    boot.output <- c(boot.output, change.data$predicted[2]-change.data$predicted[1])
    data.boot <- model.boot <- NULL
    change.data$predicted <- NULL
    print(i)
  }, error=function(e){})
}
lower <- c(lower, quantile(boot.output, 0.025))
upper <- c(upper, quantile(boot.output, 0.975))
change.data <- boot.output <- NULL  

results <- data.frame(variable, output, lower, upper)
results$output <- as.numeric(paste(results$output))
results <- arrange(results, output)

ggplot(results, aes(output, reorder(variable, output))) + geom_point(aes(shape=output<0)) + 
  labs(x="", y="") + theme_bw() + 
  theme(legend.position="none", plot.margin = margin(0.5, 0.6, 0.5, 0, "cm"), axis.text=element_text(size=12)) +
  geom_errorbarh(aes(xmin = lower, xmax = upper, height=.1))


rm(list=setdiff(ls(), "main.data"))

#FIGURE 2#########################################
base.form <- refugees ~ log_bdeadhigh + yearinconflict + yearinconflictSq
lit.form <- refugees ~ log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility +
  internationalization + polity2 +  interstatewar + genocide + 
  log_population_gleditsch + yearinconflict + yearinconflictSq
gdp.form <- refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility +
  internationalization + polity2 +  interstatewar +     
  genocide + log_population_gleditsch + yearinconflict + yearinconflictSq + 
  countries_in500km + gdppc_weighted 
polity.form <- refugees ~ log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility +
  internationalization + polity2 +  interstatewar +
  genocide + log_population_gleditsch +  yearinconflict + yearinconflictSq +
  countries_in500km + polity_weighted 
polity.gdp.form <- refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + yearinconflictSq + 
  log_population_gleditsch +  yearinconflict + 
  countries_in500km + polity_weighted + gdppc_weighted

conflicts <- unique(main.data$conflictid)
actual <- gdp <- base <- polity.and.gdp <- polity <- lit  <- NULL
set.seed(123)
for(i in 1:length(conflicts)){
  if(i%%10 == 0) {print(i)}
  learn <- test <- NULL
  
  
  learn <- main.data[!main.data$conflictid %in% conflicts[i], ]
  test <- main.data[main.data$conflictid %in% conflicts[i], ]          
 
  lm.base <- lm.lit <- lm.polity <- lm.gdp <- lm.polity.gdp <- NULL
  try(lm.base <- glm.nb(base.form, data=learn))
  try(lm.lit <- glm.nb(lit.form, data=learn))
  try(lm.polity <- glm.nb(polity.form, data=learn))
  try(lm.gdp <- glm.nb(gdp.form, data=learn))
  try(lm.polity.gdp <- glm.nb(polity.gdp.form, data=learn))
  
  try(test$pred.base <- predict(lm.base, type='response', newdata = test))
  try(test$pred.lit <- predict(lm.lit, type='response', newdata = test))
  try(test$pred.polity <- predict(lm.polity, type='response', newdata = test))
  try(test$pred.gdp <- predict(lm.gdp, type='response', newdata = test))
  try(test$pred.polity.gdp <- predict(lm.polity.gdp, type='response', newdata = test))
  
  base <- c(base, test$pred.base) 
  lit <- c(lit, test$pred.lit)
  polity <- c(polity, test$pred.polity)
  gdp <- c(gdp, test$pred.gdp)
  polity.and.gdp <- c(polity.and.gdp, test$pred.polity.gdp)
  actual <- c(actual, test$refugees)
  
}

results.base <- abs(base - actual)
results.lit <- abs(lit - actual)
results.gdp <- abs(gdp - actual)
results.polity <- abs(polity - actual)
results.polity.gdp <- abs(polity.and.gdp - actual)

boot.median <- function(data, indices){
  d <- data[indices]
  return(median(d))
}

b1.index <- 0
ups <- lows <- NULL
for(this.data in list(results.base,
                      results.lit,
                      results.polity,
                      results.gdp,
                      results.polity.gdp
)){          
  b1.index <- b1.index+1
  print(b1.index)
  results <- boot(data=this.data, statistic=boot.median, R=2000)
  ci.95.low <- unlist(boot.ci(results, type="bca"))$bca4
  ci.95.high <- unlist(boot.ci(results, type="bca"))$bca5
  ups <- c(ups, ci.95.high)
  lows <- c(lows, ci.95.low)
}


median.error<- c(median(results.base),
                 median(results.lit),
                 median(results.polity),
                 median(results.gdp),
                 median(results.polity.gdp))
models <- c("Base", "Literature", "W_Polity", "W_GDPPC", "W_GDPPC + W_Polity")
output_plot <- data.frame(matrix(c(models, median.error, ups, lows), ncol=4))
colnames(output_plot) <- c("model", "median.error", "up.ci", "low.ci")
output_plot$median.error <- as.numeric(paste(output_plot$median.error))
output_plot$up.ci <- as.numeric(paste(output_plot$up.ci))
output_plot$low.ci <- as.numeric(paste(output_plot$low.ci))


ggplot(output_plot, aes(median.error, reorder(models, median.error))) + geom_point() + 
  labs(y="", x="Median Absolute Error") + theme_bw() +
  theme(legend.position="none", plot.margin = margin(0.5, 0.6, 0.5, 0, "cm"), axis.text=element_text(size=12)) +  
  geom_errorbarh(aes(xmin = low.ci, xmax = up.ci, height=.1))

rm(list=setdiff(ls(), "main.data"))

#FIGURE 3A######################################
conflicts <- unique(main.data$conflictid)
set.seed(123)
variables <- c("log_bdeadhigh",
               "log_gdppc_gleditsch",
               "territoryincompatibility",
               "internationalization",
               "polity2",
               "interstatewar",
               "genocide",
               "log_population_gleditsch",
               "yearinconflict",
               "countries_in500km",
               "gdppc_weighted"
               )
results <- data.frame(v1 = NULL, perf = NULL)

for(v1 in (length(variables)+1):1){
  print(paste('Removing', variables[c(v1)]))     
  means.reduced <- actual <- NULL
  
  
  for(i in 1:length(conflicts)){
    if(i%%10 == 0) {print(i)}
    learn <- test <- NULL
    
    learn <- main.data[!main.data$conflictid %in% conflicts[i], ]
    test <- main.data[main.data$conflictid %in% conflicts[i], ]          
    
    reduced.model <- as.formula(paste("refugees ~ ", paste(variables[-v1], collapse='+')))
    lm.reduced <- NULL
    try(lm.reduced <- glm.nb(reduced.model, data=learn))
    try(test$pred.reduced <- predict(lm.reduced, type='response', newdata = test))
    means.reduced <- c(means.reduced, mean(test$pred.reduced, na.rm=T))
    
    actual <- c(actual, mean(test$refugees, na.rm=T))
  }
  score <- median(abs(means.reduced - actual), na.rm=T)
  print(score)
  results <- rbind(results, cbind(variables[v1], score))
}

results$score <- as.numeric(paste(results$score))
results$V1 <- c("(Full Model)", "W_GDPPC", "Countries in 500km", 
                "Year in Conflict", "Population", "Genocide", "Interstate War", 
                "Polity", "Internationalization", "Territory Incompatibility", 
                "GDPPC", "Battle Deaths")
results$score <- results$score - results$score[results$V1 == "(Full Model)"]

ggplot(results, aes(score, reorder(V1, score))) + geom_point(aes(shape=score<0)) + 
  labs(x="", y="") + theme_bw() + scale_x_continuous(breaks=seq(-10000,25000, 5000)) + 
  theme(legend.position="none", plot.margin = margin(0.5, 0.6, 0.5, 0, "cm"), axis.text=element_text(size=14))

rm(list=setdiff(ls(), "main.data"))

#FIGURE 3B######################################
conflicts <- unique(main.data$conflictid)
set.seed(123)
variables <- c("log_bdeadhigh",
               "log_gdppc_gleditsch",
               "territoryincompatibility",
               "internationalization",
               "polity2",
               "interstatewar",
               "genocide",
               "log_population_gleditsch",
               "yearinconflict",
               "countries_in500km",
               "polity_weighted"
)
results <- data.frame(v1 = NULL, perf = NULL)

for(v1 in (length(variables)+1):1){
  print(paste('Removing', variables[c(v1)]))     
  means.reduced <- actual <- NULL
  
  
  for(i in 1:length(conflicts)){
    if(i%%10 == 0) {print(i)}
    learn <- test <- NULL
    
    learn <- main.data[!main.data$conflictid %in% conflicts[i], ]
    test <- main.data[main.data$conflictid %in% conflicts[i], ]          
    
    reduced.model <- as.formula(paste("refugees ~ ", paste(variables[-v1], collapse='+')))
    lm.reduced <- NULL
    try(lm.reduced <- glm.nb(reduced.model, data=learn))
    try(test$pred.reduced <- predict(lm.reduced, type='response', newdata = test))
    means.reduced <- c(means.reduced, mean(test$pred.reduced, na.rm=T))
    
    actual <- c(actual, mean(test$refugees, na.rm=T))
  }
  score <- median(abs(means.reduced - actual), na.rm=T)
  print(score)
  results <- rbind(results, cbind(variables[v1], score))
}

results$score <- as.numeric(paste(results$score))
results$V1 <- c("(Full Model)", "W_Polity", "Countries in 500km", 
                "Year in Conflict", "Population", "Genocide", "Interstate War", 
                "Polity", "Internationalization", "Territory Incompatibility", 
                "GDPPC", "Battle Deaths")
results$score <- results$score - results$score[results$V1 == "(Full Model)"]

ggplot(results, aes(score, reorder(V1, score))) + geom_point(aes(shape=score<0)) + 
  labs(x="", y="") + theme_bw() + scale_x_continuous(breaks=seq(-15000,25000, 10000)) + 
  theme(legend.position="none", plot.margin = margin(0.5, 0.6, 0.5, 0, "cm"), axis.text=element_text(size=14))


rm(list=setdiff(ls(), "main.data"))

#FIGURE 4A and 4B###################################
lit.model <- refugees ~ log_bdeadhigh + log_gdppc_gleditsch + 
  territoryincompatibility + internationalization + polity2 +  interstatewar +
  genocide + log_population_gleditsch + yearinconflict + yearinconflictSq +
  countries_in500km
polity.model <- refugees ~ log_bdeadhigh + log_gdppc_gleditsch + 
  territoryincompatibility + internationalization + polity2 +  interstatewar +
  genocide + log_population_gleditsch + yearinconflict + yearinconflictSq +
  countries_in500km + polity_weighted

countries <- unique(main.data$ccode)
actual <- polity.prediction <- polity.weighted <- year <- ccode <- lit.prediction <- NULL

set.seed(123)
for(i in 1:length(countries)){ 
  if(i%%10 == 0) {print(i)}
  learn <- test <- NULL
  
  learn <- main.data[!main.data$ccode==countries[i], ]
  test <- main.data[main.data$ccode==countries[i], ]        
  lm.lit <- lm.gdppc <- lm.polity <- NULL
  try(lm.lit <- glm.nb(lit.model, data=learn))
  try(lm.polity <- glm.nb(polity.model, data=learn))
  
  try(test$pred.lit <- predict(lm.lit, type='response', newdata = test))
  try(test$pred.polity <- predict(lm.polity, type='response', newdata = test))
  
  lit.prediction <- c(lit.prediction, test$pred.lit)
  polity.prediction <-c(polity.prediction, test$pred.polity)
  actual <- c(actual, test$refugees)
  polity.weighted <- c(polity.weighted, test$polity_weighted)
  year <- c(year, test$year)
  ccode <- c(ccode, test$ccode)
}

out.of.sample <- data.frame(year, ccode, actual, lit.prediction, polity.prediction, polity.weighted)
out.of.sample <- na.omit(out.of.sample)
tomerge <- out.of.sample %>%
  group_by(ccode)%>%
  summarise(year=max(year), tomerge=1)
out.of.sample <- merge(out.of.sample, tomerge, by=c("ccode", "year"))
out.of.sample$polity.weighted.error <- out.of.sample$actual - out.of.sample$polity.prediction
out.of.sample$lit.error <- out.of.sample$actual - out.of.sample$lit.prediction
out.of.sample$country <- countrycode(out.of.sample$ccode, "cown", "country.name")
out.of.sample$country <- ifelse(out.of.sample$country=="Myanmar (Burma)", "Myanmar", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="Congo - Kinshasa", "Congo (Kinshasa)", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="Congo - Brazzaville", "Congo (Brazzaville)", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="Trinidad & Tobago", "Trinidad and Tobago", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="Bosnia & Herzegovina", "Bosnia and Herz.", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="Central African Republic", "Central African Rep.", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="C?te d'Ivoire", "Ivory Coast", out.of.sample$country)
out.of.sample$country <- ifelse(out.of.sample$country=="Yemen Arab Republic", "Yemen", out.of.sample$country)

#FIGURE 4A####
sPDF <- joinCountryData2Map(out.of.sample, joinCode="NAME", nameJoinColumn = "country")
catMethod <- c(as.numeric(quantile(out.of.sample$lit.error, c(seq(0, 1, 0.1)))))
colourPalette <- c(brewer.pal(9,"Reds")[9:3], brewer.pal(9,"Blues")[7:9])


mapParams <- mapCountryData(sPDF,nameColumnToPlot="lit.error", 
                            catMethod=catMethod,
                            mapTitle="",
                            colourPalette=colourPalette,
                            missingCountryCol="grey", 
                            addLegend="FALSE"
)
do.call(addMapLegend, c( mapParams, legendLabels="limits", 
                         labelFontSize=0.7,legendShrink=0.7,
                         legendMar=4, legendWidth=0.6,
                         legendIntervals="page"))

#FIGURE 4B####
catMethod_polity <- quantile(out.of.sample$polity.weighted, seq(0,1, 0.1))
mapParams <- mapCountryData(sPDF,nameColumnToPlot="polity.weighted", 
                            catMethod=catMethod_polity,
                            mapTitle="",
                            colourPalette=colourPalette,
                            missingCountryCol="grey", 
                            addLegend="FALSE"
)
do.call( addMapLegend, c( mapParams, legendLabels="limits", 
                          labelFontSize=0.7,legendShrink=0.7,
                          legendMar=4, legendWidth=0.6,
                          legendIntervals="page"))

rm(list=setdiff(ls(), "main.data"))



#FIGURE 5A##################################
set.seed(123)
c1 <- as.data.frame(with(main.data, 
                         Make.Z(data.frame(refugees, log_bdeadhigh,
                                           territoryincompatibility,
                                           internationalization, polity2 , 
                                           log_gdppc_gleditsch, interstatewar, 
                                           genocide, log_population_gleditsch,
                                           yearinconflict, countries_in500km, 
                                           gdppc_weighted
                         ))))
colnames(c1) <- c("Refugees", "Battle deaths", "Territory Incompatibility", "Internationalization", "Polity", "GDPPC", "Interstate War", "Genocide", "Population", "Year in Conflict", "Countries in 500 km", "W_GDPPC")

bms1 <- bms(c1, mprior = "uniform", g = "UIP", user.int = F, nmodel = 2048)
image(bms1[1:1000], main='') 

rm(list=setdiff(ls(), "main.data"))

#FIGURE 5B####################################
set.seed(123)
c1 <- as.data.frame(with(main.data, 
                         Make.Z(data.frame(refugees, log_bdeadhigh,
                                           territoryincompatibility,
                                           internationalization, polity2 , 
                                           log_gdppc_gleditsch, interstatewar, 
                                           genocide, log_population_gleditsch,
                                           yearinconflict, countries_in500km, 
                                           polity_weighted
                         ))))
colnames(c1) <- c("Refugees", "Battle deaths", "Territory Incompatibility", "Internationalization", "Polity", "GDPPC", "Interstate War", "Genocide", "Population", "Year in Conflict", "Countries in 500 km", "W_Polity")

bms1 <- bms(c1, mprior = "uniform", g = "UIP", user.int = F, nmodel = 2048)
image(bms1[1:1000], main='') 

rm(list=setdiff(ls(), "main.data"))



#FIGURE A1#####################################
base.form <- refugees ~  log_bdeadhigh + yearinconflict 
lit.form <- refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch +  
  yearinconflict + countries_in500km 
main.form <- refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch + 
  yearinconflict + countries_in500km +  gdppc_weighted
main.polity.form <- refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch +  
  yearinconflict  + countries_in500km +  polity_weighted
border.gdppc.form <-  refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch + 
  yearinconflict  +  countries_in500km + gdppc_weighted_border_16
border.polity.form <-  refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch + 
  yearinconflict  + countries_in500km + polity_weighted_border_16
mean.gdppc.form <-  refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch +  
  yearinconflict  +  countries_in500km + mean_500km_gdppc
mean.polity.form <-  refugees ~  log_bdeadhigh + log_gdppc_gleditsch + territoryincompatibility + 
  internationalization + polity2 +  interstatewar + genocide + log_population_gleditsch +  
  yearinconflict +  countries_in500km + mean_500km_polity


conflicts <- unique(main.data$conflictid)
actual <- means.main <- means.main.polity <- means.gdppc.border <- means.polity.border <- means.gdppc.mean <- means.polity.mean <- means.lit <- means.base <- NULL
set.seed(123)
for(i in 1:length(conflicts)){
  if(i%%10 == 0) {print(i)}
  learn <- test <- NULL
  learn <- main.data[!main.data$conflictid %in% conflicts[i], ]
  test <- main.data[main.data$conflictid %in% conflicts[i], ]          
  lm.main <- lm.main.polity <- lm.base <- lm.polity.mean <- lm.gdppc.mean <- lm.polity.border <- lm.gdppc.border <- lm.lit <- NULL
  
  try(lm.main <- glm.nb(main.form, data=learn))
  try(lm.main.polity <- glm.nb(main.polity.form, data=learn))
  try(lm.base <- glm.nb(base.form, data=learn))
  try(lm.lit <- glm.nb(lit.form, data=learn))
  try(lm.gdppc.border <- glm.nb(border.gdppc.form, data=learn))
  try(lm.polity.border <- glm.nb(border.polity.form, data=learn))
  try(lm.gdppc.mean <- glm.nb(mean.gdppc.form, data=learn))
  try(lm.polity.mean <- glm.nb(mean.polity.form, data=learn))
  
  
  try(test$pred.main <- predict(lm.main, type='response', newdata = test))
  try(test$pred.main.polity <- predict(lm.main.polity, type='response', newdata = test))
  try(test$pred.base <- predict(lm.base, type='response', newdata = test))
  try(test$pred.lit <- predict(lm.lit, type='response', newdata = test))
  try(test$pred.gdppc.border <- predict(lm.gdppc.border, type='response', newdata = test))
  try(test$pred.polity.border <- predict(lm.polity.border, type='response', newdata = test))
  try(test$pred.gdppc.mean <- predict(lm.gdppc.mean, type='response', newdata = test))
  try(test$pred.polity.mean <- predict(lm.polity.mean, type='response', newdata = test))
  
  means.main <- c(means.main, test$pred.main)
  means.main.polity <- c(means.main.polity, test$pred.main.polity)
  means.base <- c(means.base, test$pred.base)
  means.lit <- c(means.lit, test$pred.lit)
  means.gdppc.border <- c(means.gdppc.border, test$pred.gdppc.border)
  means.polity.border <- c(means.polity.border, test$pred.polity.border)
  means.gdppc.mean <- c(means.gdppc.mean, test$pred.gdppc.mean)
  means.polity.mean <- c(means.polity.mean, test$pred.polity.mean)
  actual <- c(actual, test$refugees)
}

#For some observations, gdppc border is problematic. Rerun it.
means.gdppc.border <-  NULL
set.seed(123)
for(i in 1:length(conflicts)){
  if(i%%10 == 0) {print(i)}
  learn <- test <- NULL
  learn <- main.data[!main.data$conflictid %in% conflicts[i], ]
  test <- main.data[main.data$conflictid %in% conflicts[i], ]          
  lm.gdppc.border <- NULL
  
  try(lm.gdppc.border <- glm.nb(border.gdppc.form, data=learn))
  
  try(test$pred.gdppc.border <- predict(lm.gdppc.border, type='response', newdata = test))
  if(!is.null(lm.gdppc.border)){
    means.gdppc.border <- c(means.gdppc.border, test$pred.gdppc.border)
  }else{
    means.gdppc.border <- c(means.gdppc.border, rep(NA, nrow(test)))
  }
}

par(mar=c(16,3,2,1), pty='s')
results.main <- abs(means.main - actual)
results.base <- abs(means.base - actual)
results.lit <- abs(means.lit - actual)
results.main.polity <- abs(means.main.polity - actual)
results.gdppc.border <- abs(means.gdppc.border - actual)
results.polity.border <- abs(means.polity.border - actual)
results.gdppc.mean <- abs(means.gdppc.mean - actual)
results.polity.mean <- abs(means.polity.mean - actual)

b1 <- barplot(
  c(
    median(results.base, na.rm=T),
    median(results.lit, na.rm=T),
    median(results.gdppc.mean, na.rm=T),
    median(results.polity.mean, na.rm=T),
    median(results.gdppc.border, na.rm=T),
    median(results.polity.border, na.rm=T),
    median(results.main.polity, na.rm=T),
    median(results.main, na.rm=T)
  ),
  main = '', las=1, ylim=c(20000, 120000),  xpd=F, col='white') 

  
axis(1, at=c(0.2, 10.2), labels=FALSE, lwd.ticks=0)
axis(1, labels = c("Base","Literature",  "Neighbor GDPPC Mean","Neighbor Polity Mean", expression('W'[it]*'GDPPC - N Border'),  expression('W'[it]*'Polity - N Border'), expression('W'[it]*'Polity - Distance'),  expression('W'[it]*'GDPPC - Distance')),
     las = 2, cex.axis = 1.7, at = b1)
mtext('Median Absolute Error', 2, line=4, cex=1.7)

boot.median <- function(data, indices){
  d <- data[indices]
  return(median(d))
}

b1.index <- 0
ups <- lows <- NULL
for(this.data in list(results.base[!is.na(results.base)],
                      results.lit[!is.na(results.lit)],
                      results.gdppc.mean[!is.na(results.gdppc.mean)],
                      results.polity.mean[!is.na(results.polity.mean)],
                      results.polity.border[!is.na(results.polity.border)],
                      results.gdppc.border[!is.na(results.gdppc.border)],
                      results.main.polity[!is.na(results.main.polity)],
                      results.main[!is.na(results.main)]
)){          
  b1.index <- b1.index+1
  print(b1.index)
  results <- boot(data=this.data, statistic=boot.median, R=2000)
  ci.95.low <- unlist(boot.ci(results, type="bca"))$bca4
  ci.95.high <- unlist(boot.ci(results, type="bca"))$bca5
  ups <- c(ups, ci.95.high)
  lows <- c(lows, ci.95.low)
  CI.size <- ci.95.high - ci.95.low
  segments(b1[b1.index], median(this.data[!is.na(this.data)]) + CI.size/2, b1[b1.index], median(this.data[!is.na(this.data)]) - CI.size/2)
}     
